
#### Preliminaries ####

R.version$version.string

# clear workspace
rm(list=ls())

# load packages
library(here)
library(tidyverse)
library(rworldmap) # maps
library(magrittr)
library(reshape2)
library(ggpubr)
library(leaflet)
library(plotly)
library(gapminder)
library(countrycode)
library(ggrepel)
library(ggthemes)
library(modelsummary)
library(fixest)
library(marginaleffects)
library(estimatr)
library(brglm)
library(GJRM)
library(lmtest)
library(sandwich)
library(zoo)



ggplot_theme <- theme(axis.text=element_text(size=16),
                      axis.title=element_text(size=16,face="bold"), 
                      title =element_text(size=16, face='bold'))

legend_size_theme <-   theme(legend.key.size = unit(1, 'cm'), #change legend key size
                             legend.key.height = unit(1, 'cm'), #change legend key height
                             legend.key.width = unit(1, 'cm'), #change legend key width
                             legend.title = element_text(size=16), #change legend title font size
                             legend.text = element_text(size=16)) #change legend text font size 


#### set the working directory, if not using the Rproject #####

#---------------------------------------------------#
#--------------------load data ---------------------#
#---------------------------------------------------#

# 1. Load V-Dem data (version 14)

vdem_full_v14 <- readRDS("data/vdem14/V-Dem-CY-Full+Others-v14.rds")

vdem_subset <- vdem_full_v14 %>%
  dplyr::select(country_name, year, country_text_id, COWcode, country_id, starts_with("v2x_polyarchy"),
                starts_with("v2xca_academ"), starts_with("v2cafres"), starts_with("v2cafexch"), 
                starts_with("v2cainsaut"), starts_with("v2casurv"), 
                starts_with("v2clacfree"), e_pop, e_pop_sd, e_gdppc, e_gdppc_sd,  starts_with("v2x_liberal"), 
                v2x_regime, e_regionpol_6C, e_regionpol, v2canuni, v2caprotac, v2x_veracc, v2x_horacc, v2x_diagacc, 
                v2lgbicam, v2juhcind, v2jureview, v2exl_legitideol, v2exl_legitperf, v2exl_legitratio, 
                v2cacamps,e_boix_regime, v2petersch ) %>%
  filter(year >=1899)


episode_data <- read.csv("data/episode_data/episode_without_uncertainty_interval.csv")

episode_data <- episode_data %>%
  dplyr::select(-c(X, starts_with("v2xca_academ"), country_id, country_name))

summary(episode_data)
length(table(episode_data$decline_episode_id)) -1# 198 episodes

ert_data <- read.csv("data/ert_v14.csv")

ert_data <- ert_data %>%
  dplyr::select(-c(X, starts_with("v2x_polyarchy"), v2x_regime, country_id, country_name))


#---------------------------------------------------#
#--------------------merge data --------------------#
#---------------------------------------------------#

vdem_subset <- vdem_subset %>%
  left_join(episode_data, by = c("country_text_id", "year")) %>%
  left_join(ert_data, by = c("country_text_id", "year")) %>%
  group_by(country_text_id) %>%
  mutate(v2xca_academ_lag = lag(v2xca_academ)) 

vdem_subset <- vdem_subset %>%
  group_by(country_text_id) %>%
  mutate(aut_ep_lag_1 = lag(aut_ep), 
         aut_ep_lag_2 = lag(aut_ep, 2),
         aut_ep_lag_3 = lag(aut_ep, 3),
         aut_ep_lag_4 = lag(aut_ep, 4),
         aut_ep_lag_5 = lag(aut_ep, 5),
         aut_ep_lag_1_3 = ifelse(aut_ep_lag_1 == 1 | aut_ep_lag_2 == 1 | aut_ep_lag_3 == 1, 1, 0), 
         aut_ep_lag_1_5 = ifelse(aut_ep_lag_1 == 1 | aut_ep_lag_2 == 1 | aut_ep_lag_3 == 1 | aut_ep_lag_4 == 1 | aut_ep_lag_5 == 1, 1, 0)) %>%
  mutate(aut_ep_lag_1_3 = ifelse(aut_ep_lag_1_3 == 0 & aut_ep == 1, 1, aut_ep_lag_1_3), 
         aut_ep_lag_1_5 = ifelse(aut_ep_lag_1_5 == 0 & aut_ep == 1, 1, aut_ep_lag_1_5)) 

test_data <- vdem_subset %>%
  dplyr::select(country_text_id, year, aut_ep_id, aut_ep, aut_ep_lag_1, aut_ep_lag_1_3, aut_ep_lag_1_5 )

rm(test_data)

################################################################################
################################################################################


epsm_data <- readxl::read_xlsx("data/EPSM dataset/EPSM_dataset_clean.xlsx")

epsm_data <- epsm_data %>%
  dplyr::select(-c(country_name, country_id, project, historical, histname, gapstart1, gapstart2, gapstart3, 
                   gapend1, gapend2, gapend3, COWcode))

table(epsm_data$operate_high)
table(epsm_data$edu_dep)

vdem_subset <- vdem_subset %>%
  left_join(epsm_data, by = c("country_text_id", "year"))

vdem_subset <- vdem_subset %>%
  mutate(edu_dep = ifelse(edu_dep == 1, 0, 1)) # 0 = no centralized education department at national level, 1 = national level education department
table(vdem_subset$edu_dep)

vdem_subset <- vdem_subset %>%
  mutate(operate_high_simple = ifelse(operate_high == 1 | operate_high == 2 , "subnational", 
                                      ifelse(operate_high == 3, "national", 
                                             ifelse(operate_high == 10, "subnational", 
                                                    ifelse(operate_high == 9 | operate_high == 4 | operate_high == 5 | operate_high == 6 | 
                                                             operate_high == 7, "private organization", "Private and Public")))))

table(vdem_subset$operate_high_simple)


#---------------------------------------------------#
#------------ Stock Variable Functions--------------#
#---------------------------------------------------#


decay_sum <- function(tm, vl, decay) {
  last_time <- 0
  current_sum <- 0
  sums <- numeric(length(vl))
  ldecay <- (1-decay)
  for (i in 1:length(vl)) {
    delta <- as.numeric(tm[i] - last_time)
    current_sum <- current_sum * (ldecay * delta) + vl[i]
    last_time <- tm[i]
    sums[i] <- current_sum
  }
  sums
}

annual_depricition_function  <- function(year, var, decay) {
  last_time <- first(year)
  current_sum <- 0
  sums <- numeric(length(var))
  ldecay <- (1-decay)
  for (i in 1:length(var)) {
    delta <- as.numeric(year[i] - last_time)
    current_sum <- ldecay * (current_sum + decay * var[i])
    last_time <- year[i]
    sums[i] <- current_sum
  }
  sums
}


## Functions ##

na_interpolation2 <- function(x, option = "linear", maxgap) {
  library(imputeTS)
  library(dplyr)
  
  total_not_missing <- sum(!is.na(x))
  
  # check there is sufficient data for na_interpolation 
  if(total_not_missing < 2) {x} 
  
  else
    
    # replace takes an input vector, a T/F vector & replacement value
  {replace(
    # input vector is interpolated data
    # this will impute leading/lagging NAs which we don't want 
    imputeTS::na_interpolation(x, option = option, maxgap = maxgap), 
    
    # create T/F vector for NAs,  
    is.na(na.approx(x, na.rm = FALSE)), 
    
    # replace TRUE with NA in input vector  
    NA) 
  }
}


vdem_subset <- vdem_subset %>%
  group_by(country_id) %>%
  mutate(v2xca_academ_interpolated = na_interpolation2(v2xca_academ, option = "linear", maxgap = 5)) %>%
  drop_na(v2xca_academ_interpolated) %>%
  mutate(v2xca_academ_stock = annual_depricition_function(year, v2xca_academ_interpolated, 0.02)) %>%
  dplyr::select(country_id, country_name, year, v2xca_academ, v2xca_academ_stock, everything())

vdem_subset <- vdem_subset %>%
  group_by(country_id) %>%
  fill(e_gdppc,e_pop, .direction = "down")

#### Data Wrangling ####

vdem_subset_v14 <- vdem_subset %>%
  group_by(e_regionpol, year) %>%
  mutate(regionpol_EDI = mean(v2x_polyarchy, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(country_id) %>%
  mutate(e_gdppc_growth = ((e_gdppc - dplyr::lag(e_gdppc))/dplyr::lag(e_gdppc))*100) %>%
  mutate(e_gdppc_growth_roll5 = rollmean(e_gdppc_growth, k = 5, fill = NA)) %>%
  ungroup() %>%
  group_by(decline_episode_id) %>%
  mutate(decline_episode_duration =  year- first(year), 
         decline_episode_duration = ifelse(is.na(decline_episode_id), 0, decline_episode_duration)) %>%
  group_by(country_id) %>%
  mutate(group_id = cumsum(decline_episode != lag(decline_episode, default = FALSE))) %>% # group_id for each autocra_period
  group_by(country_id, group_id) %>%
  mutate(spell = year-first(year), 
         spell = ifelse(is.na(aut_ep_id), spell, 0))

vdem_subset_v14 <- vdem_subset_v14 %>%
  distinct(country_text_id, year, .keep_all = T)

summary(vdem_subset_v14$e_pop)
summary(vdem_subset_v14$e_gdppc)

vdem_subset_v14 <- vdem_subset_v14 %>%
  mutate(e_pop_log = log(e_pop))

summary(vdem_subset_v14$e_pop_log)

## lags for independent variables ##

vdem_subset_v14_noNA <- vdem_subset_v14 %>%
  mutate(year_1900 = year -1900, 
         v2lgbicam = ifelse(v2lgbicam >=1, 1, 0), 
         v2caprotac = ifelse(v2caprotac == 95, 0, 
                             ifelse(v2caprotac == 97, 0, 
                                    ifelse(v2caprotac == 99, 0, v2caprotac)))) %>%
  group_by(country_id) %>%
  mutate(v2xca_academ_stock_lag = lag(v2xca_academ_stock), 
         operate_high_simple_lag = lag(operate_high_simple), 
         edu_dep_lag = lag(edu_dep),
         e_gdppc_lag = lag(e_gdppc), 
         e_gdppc_growth_roll5_lag = lag(e_gdppc_growth_roll5), 
         regionpol_EDI_lag = lag(regionpol_EDI), 
         e_pop_log_lag = lag(e_pop_log), 
         v2canuni_lag = lag(log(v2canuni+1)), 
         v2caprotac_lag = lag(v2caprotac), 
         v2x_veracc_lag = lag(v2x_veracc), 
         v2x_diagacc_lag = lag(v2x_diagacc), 
         v2x_horacc_lag = lag(v2x_horacc), 
         v2lgbicam_lag = lag(v2lgbicam), 
         v2juhcind_lag = lag(v2juhcind), 
         v2jureview_lag = lag(v2jureview), 
         v2exl_legitideol_lag = lag(v2exl_legitideol), 
         v2exl_legitperf_lag = lag(v2exl_legitperf), 
         v2exl_legitratio_lag = lag(v2exl_legitratio), 
         v2cacamps_lag = lag(v2cacamps), 
         e_boix_regime_lag = lag(e_boix_regime), 
         v2petersch_lag = lag(v2petersch)      
  )

vdem_subset_v14_noNA <- vdem_subset_v14_noNA %>%
  mutate(region_factor = as.factor(e_regionpol_6C),
         region_factor = relevel(region_factor, ref=1), 
         spell = spell/100,
         spell_sq = spell^2/100,
         spell_cb = spell^3/100,
         year_sq = year_1900^2)

summary(vdem_subset_v14_noNA$v2caprotac_lag)
table(vdem_subset_v14_noNA$v2caprotac_lag)

vdem_subset_v14_noNA <- vdem_subset_v14_noNA %>%
  group_by(country_id) %>%
  fill(e_gdppc_growth_roll5_lag, .direction = "down")


## drop NA ## 
vdem_subset_v14_noNA <- vdem_subset_v14_noNA %>%
  filter(v2x_regime >= 2) %>%
  group_by(country_text_id) %>%
  drop_na(e_gdppc, e_gdppc_growth_roll5_lag, regionpol_EDI_lag, 
          e_pop_log, region_factor, year_1900, spell, country_id) 

summary(vdem_subset_v14_noNA)
length(unique(vdem_subset_v14_noNA$country_name)) # 115 countries 
min(vdem_subset_v14_noNA$year)
max(vdem_subset_v14_noNA$year)

################################################################################
################################################################################

#### TWFE regression ####

m1 <- feols(v2xca_academ ~ aut_ep_lag_1 + e_gdppc_lag + e_gdppc_growth_roll5_lag + e_pop_log_lag | 
              country_name + year, data = vdem_subset_v14_noNA, vcov = "cluster")

m2 <- feols(v2xca_academ ~ aut_ep_lag_1_3 + e_gdppc_lag + e_gdppc_growth_roll5_lag + e_pop_log_lag | 
              country_name + year, data = vdem_subset_v14_noNA, vcov = "cluster")

m3 <- feols(v2xca_academ ~ aut_ep_lag_1_5 + e_gdppc_lag + e_gdppc_growth_roll5_lag + e_pop_log_lag | 
              country_name + year, data = vdem_subset_v14_noNA, vcov = "cluster")

summary(m3)

## Extract models ##
texreg::wordreg(list(m1, m2, m3), 
                file = "outputs/Table_A6_Linear_Regression.docx", 
                custom.coef.names=c("Autocratization Episode t-1", 
                                    "GDP pc log",
                                    "GDP growth",
                                    "Population log",
                                    "Autocratization Episode t-1 to t-3", 
                                    "Autocratization Episode t-1 to t-5")
)


#### TWFE regression ####

m4 <- feols(v2xca_academ ~ aut_ep_lag_1_3 + e_gdppc_lag + e_gdppc_growth_roll5_lag + e_pop_log_lag +
              v2canuni_lag + v2caprotac_lag + v2jureview_lag | 
              country_name + year, data = vdem_subset_v14_noNA, vcov = "cluster")

m5 <- feols(v2xca_academ ~ aut_ep_lag_1_3 + e_gdppc_lag + e_gdppc_growth_roll5_lag + e_pop_log_lag +
              v2canuni_lag + v2caprotac_lag + v2jureview_lag + v2petersch_lag| 
              country_name + year, data = vdem_subset_v14_noNA, vcov = "cluster")

m6 <- feols(v2xca_academ ~ aut_ep_lag_1_3 + e_gdppc_lag + e_gdppc_growth_roll5_lag + e_pop_log_lag +
              operate_high_simple_lag + edu_dep_lag | 
              country_name + year, data = vdem_subset_v14_noNA, vcov = "cluster")

m7 <- feols(v2xca_academ ~ aut_ep_lag_1_3 + e_gdppc_lag + e_gdppc_growth_roll5_lag + e_pop_log_lag +v2cacamps_lag | 
              country_name + year, data = vdem_subset_v14_noNA, vcov = "cluster")

m8 <- feols(v2xca_academ ~ aut_ep_lag_1_3 + e_gdppc_lag + e_gdppc_growth_roll5_lag + e_pop_log_lag +
              v2x_veracc_lag + v2x_horacc_lag + v2x_diagacc_lag | 
              country_name + year, data = vdem_subset_v14_noNA, vcov = "cluster")

m9 <- feols(v2xca_academ ~ aut_ep_lag_1_3 + e_gdppc_lag + e_gdppc_growth_roll5_lag + e_pop_log_lag + v2xca_academ_stock_lag| 
              country_name + year, data = vdem_subset_v14_noNA, vcov = "cluster")

m10 <- feols(v2xca_academ ~ aut_ep_lag_1_3 + e_gdppc_lag + e_gdppc_growth_roll5_lag + e_pop_log_lag +
               v2canuni_lag + v2caprotac_lag + v2jureview_lag +
               operate_high_simple_lag + edu_dep_lag + 
               v2cacamps_lag + 
               v2x_veracc_lag + v2x_horacc_lag + v2x_diagacc_lag +  
               v2xca_academ_stock_lag | 
              country_name + year, data = vdem_subset_v14_noNA, vcov = "cluster")


## Extract models ##
texreg::wordreg(list(m4, m5, m6, m7, m8, m9, m10), 
                file = "outputs/Table_A7_Linear_Regression.docx", 
                custom.coef.names=c("Autocratization Episode t-1 to t-3", 
                                    "GDP pc log",
                                    "GDP growth",
                                    "Population log",
                                    "Number of Universities", 
                                    "Constitutional proctection of AF", 
                                    "Judicial review", 
                                    "Teritary Education Enrollment", 
                                    "University Operation: Private and Public", 
                                    "University Operation: Private", 
                                    "University Operation: Subnational", 
                                    "National-level Education Department", 
                                    "Political Polarization", 
                                    "Vertical Accounability", 
                                    "Horizontal Accountability", 
                                    "Diagonal Accounability", 
                                    "Academic Freedom Stock")
)



